import warnings
warnings.filterwarnings("ignore")
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
# Libraries to help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Libraries to tune model, get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
#libraries to help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier)
from xgboost import XGBClassifier
# Loading the dataset
data = pd.read_csv("BankChurners.csv")
# Keep an untouched copy of the raw data before any cleaning/encoding
BankChurners=data.copy()
# Display the first five rows of the dataset
data.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | Unknown | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
# Display the last five rows of the dataset
data.tail()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10122 | 772366833 | Existing Customer | 50 | M | 2 | Graduate | Single | $40K - $60K | Blue | 40 | 3 | 2 | 3 | 4003.0 | 1851 | 2152.0 | 0.703 | 15476 | 117 | 0.857 | 0.462 |
| 10123 | 710638233 | Attrited Customer | 41 | M | 2 | Unknown | Divorced | $40K - $60K | Blue | 25 | 4 | 2 | 3 | 4277.0 | 2186 | 2091.0 | 0.804 | 8764 | 69 | 0.683 | 0.511 |
| 10124 | 716506083 | Attrited Customer | 44 | F | 1 | High School | Married | Less than $40K | Blue | 36 | 5 | 3 | 4 | 5409.0 | 0 | 5409.0 | 0.819 | 10291 | 60 | 0.818 | 0.000 |
| 10125 | 717406983 | Attrited Customer | 30 | M | 2 | Graduate | Unknown | $40K - $60K | Blue | 36 | 4 | 3 | 3 | 5281.0 | 0 | 5281.0 | 0.535 | 8395 | 62 | 0.722 | 0.000 |
| 10126 | 714337233 | Attrited Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Silver | 25 | 6 | 2 | 4 | 10388.0 | 1961 | 8427.0 | 0.703 | 10294 | 61 | 0.649 | 0.189 |
# Column dtypes and non-null counts -- quick check for missing values and memory use
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 10127 non-null object 6 Marital_Status 10127 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
# Number of distinct values per column -- helps identify ID columns and categoricals
data.nunique()
CLIENTNUM 10127 Attrition_Flag 2 Customer_Age 45 Gender 2 Dependent_count 6 Education_Level 7 Marital_Status 4 Income_Category 6 Card_Category 4 Months_on_book 44 Total_Relationship_Count 6 Months_Inactive_12_mon 7 Contacts_Count_12_mon 7 Credit_Limit 6205 Total_Revolving_Bal 1974 Avg_Open_To_Buy 6813 Total_Amt_Chng_Q4_Q1 1158 Total_Trans_Amt 5033 Total_Trans_Ct 126 Total_Ct_Chng_Q4_Q1 830 Avg_Utilization_Ratio 964 dtype: int64
"CLIENTNUM" is the Client number, a unique identifier for the customer holding the account. It has 10127 unique values, which equals the number of observations in the dataset, so we will drop this column as it won't be a useful feature.
"Customer_Age" is Customer's Age and has only 45 unique values, that means that lot of customers are of the same age.
# Drop the unique customer identifier -- it carries no predictive signal.
data.drop("CLIENTNUM", axis=1, inplace=True)
# Frequency table for every categorical / discrete feature of interest,
# printed in the same order they appear in the dataset.
for col in (
    "Attrition_Flag",
    "Gender",
    "Dependent_count",
    "Education_Level",
    "Marital_Status",
    "Income_Category",
    "Card_Category",
    "Total_Relationship_Count",
    "Months_Inactive_12_mon",
    "Contacts_Count_12_mon",
):
    print(data[col].value_counts())
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64 F 5358 M 4769 Name: Gender, dtype: int64 3 2732 2 2655 1 1838 4 1574 0 904 5 424 Name: Dependent_count, dtype: int64 Graduate 3128 High School 2013 Unknown 1519 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64 Married 4687 Single 3943 Unknown 749 Divorced 748 Name: Marital_Status, dtype: int64 Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 Unknown 1112 $120K + 727 Name: Income_Category, dtype: int64 Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64 3 2305 4 1912 5 1891 6 1866 2 1243 1 910 Name: Total_Relationship_Count, dtype: int64 3 3846 2 3282 1 2233 4 435 5 178 6 124 0 29 Name: Months_Inactive_12_mon, dtype: int64 3 3380 2 3227 1 1499 4 1392 0 399 5 176 6 54 Name: Contacts_Count_12_mon, dtype: int64
Attrition_Flag- has 2 unique values Existing Customer(83.9%) and Attrited Customer(16.06%). This is the dependent variable and has imbalanced classes.
Gender- has 2 unique values "F" (52.9%) and "M" (47.09%). Female percentage is slightly higher than Male percent but they are not highly imbalanced.
Dependent_count has 6 unique values that signify that each customer has either 0, 1, 2, 3, 4 or 5 dependents. 8.9% customers have 0 dependents, 18.1% customers have 1 dependent, 26.2% customers have 2 dependents, 26.9% customers have 3 dependents, 15.5% customers have 4 dependents and 4.2% customers have 5 dependents.
Education_Level- Most customers have a Graduate Degree, followed by High passouts. There are a very few customers who have a Doctorate or a Post-Graduate degree. There are some customers who are still in College and for 1519 customers Education Level is Unknown.
Marital_Status- Most of the customers of the Bank are Married followed by 3943 Single customers. For almost equal number of customers, the Marital Status is Unknown (749) or they are Divorced (748)
Income_Category- Most of the customers (3561) earn less than 40K annually, followed by people earning annually between 40k-60k (1790), 1535 of all customers annually earn between 80k-120k, 1402 customers annually earn between 60k-80k, 727 customers annually earn over 120K and for 1112 customer, the Annual Income is Unknown
Card_Category- 9436 customers have a "Blue" credit card, 555 have "Silver", 116 have "Gold" and 20 customers have a "Platinum" credit card. Looks like the "Blue" credit card is a starter credit card and is the one that most of the customers have.
Total_Relationship_Count- 2305 customers have 3 products from the bank, 1912 customers have 4 products from the bank, 1891 customers have 5 products from bank, 1866 customers have 6 products, 1243 customers have 2 products from the bank and 910 customers have just 1 product from the bank.
Months_Inactive_12_mon- 29 customers out of all have 0 inactive months, meaning that they use their credit card most regularly for transactions, 124 customers have not used their credit card for around 6 months, 178 customers have not used their credit card for 5 months, 435 customers have not used their credit card for around 4 months, 3846 customers have not used their credit card for around 3 months, 3282 customers have not used their credit card for around 2 months and 2233 customers have not used their credit card for 1 month.
Contacts_Count_12_mon- Out of all the customers, most of the customers have contacted the bank 3 (3380) or 2 (3227) times in last 12 months. However very less customers have contacted 5 (176) or 6 (54) times. This represents that probably Bank has a online self services for the products and most of the customers manage their accounts themselves with those self services.
# (rows, columns) of the dataset after dropping CLIENTNUM
data.shape
(10127, 20)
# Summary (count / unique / top / freq) for the categorical (object) columns
data.describe(include=['object']).T
| count | unique | top | freq | |
|---|---|---|---|---|
| Attrition_Flag | 10127 | 2 | Existing Customer | 8500 |
| Gender | 10127 | 2 | F | 5358 |
| Education_Level | 10127 | 7 | Graduate | 3128 |
| Marital_Status | 10127 | 4 | Married | 4687 |
| Income_Category | 10127 | 6 | Less than $40K | 3561 |
| Card_Category | 10127 | 4 | Blue | 9436 |
# Five-number summary plus mean/std for the numerical columns
data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Customer_Age | 10127.0 | 46.325960 | 8.016814 | 26.0 | 41.000 | 46.000 | 52.000 | 73.000 |
| Dependent_count | 10127.0 | 2.346203 | 1.298908 | 0.0 | 1.000 | 2.000 | 3.000 | 5.000 |
| Months_on_book | 10127.0 | 35.928409 | 7.986416 | 13.0 | 31.000 | 36.000 | 40.000 | 56.000 |
| Total_Relationship_Count | 10127.0 | 3.812580 | 1.554408 | 1.0 | 3.000 | 4.000 | 5.000 | 6.000 |
| Months_Inactive_12_mon | 10127.0 | 2.341167 | 1.010622 | 0.0 | 2.000 | 2.000 | 3.000 | 6.000 |
| Contacts_Count_12_mon | 10127.0 | 2.455317 | 1.106225 | 0.0 | 2.000 | 2.000 | 3.000 | 6.000 |
| Credit_Limit | 10127.0 | 8631.953698 | 9088.776650 | 1438.3 | 2555.000 | 4549.000 | 11067.500 | 34516.000 |
| Total_Revolving_Bal | 10127.0 | 1162.814061 | 814.987335 | 0.0 | 359.000 | 1276.000 | 1784.000 | 2517.000 |
| Avg_Open_To_Buy | 10127.0 | 7469.139637 | 9090.685324 | 3.0 | 1324.500 | 3474.000 | 9859.000 | 34516.000 |
| Total_Amt_Chng_Q4_Q1 | 10127.0 | 0.759941 | 0.219207 | 0.0 | 0.631 | 0.736 | 0.859 | 3.397 |
| Total_Trans_Amt | 10127.0 | 4404.086304 | 3397.129254 | 510.0 | 2155.500 | 3899.000 | 4741.000 | 18484.000 |
| Total_Trans_Ct | 10127.0 | 64.858695 | 23.472570 | 10.0 | 45.000 | 67.000 | 81.000 | 139.000 |
| Total_Ct_Chng_Q4_Q1 | 10127.0 | 0.712222 | 0.238086 | 0.0 | 0.582 | 0.702 | 0.818 | 3.714 |
| Avg_Utilization_Ratio | 10127.0 | 0.274894 | 0.275691 | 0.0 | 0.023 | 0.176 | 0.503 | 0.999 |
Customer_Age is almost normally distributed with a slight right skew. The range is from 26 years to 73 years.
Dependent_count is normally distributed. Even though the mean value is 2.3, it is not possible to have 2.3 dependents, so we can round it off to 2. Customers have 0-5 dependents.
The average Period of relationship with the bank (Months_on_book) is 35.9 months which is almost equal to median value showing that distribution is almost normal.
Total_Relationship_Count shows that customers have 1-6 products from the bank. Most of the customers have 4 products.
Credit_Limit range is from 1438 (unit unknown) to 34516. The average value of Credit limit is 8631 and the median value is 4549. This feature is not normally distributed and right skewed.
Average Total_Revolving_Bal is 1163 showing that on an average customers carry over this much balance month on month. The average value is slightly lesser than median value- 1276 showing that this is not a normally distributed feature.
Avg_Open_To_Buy refers to the amount left on the credit card to use (average of the last 12 months). The mean value of this feature for all the customers is 7469 which is higher than the median value 3474. The range is from 3 to 34516; this is a large range that shows that some of the customers hardly use the credit card while many customers use the entire credit amount.
Total_Amt_Chng_Q4_Q1- The range of this ratio is 0 to 3.4, this is a large range; the mean value is 0.76 which shows that the total transaction amount in Q4 is lower than the total transaction amount in Q1 on average for a customer. We can further investigate if we are able to find the reason for this change.
Total_Ct_Chng_Q4_Q1- the range of this feature is 0-3.7. The mean and median values are 0.7 which again is less than 1, indicating that the number of transactions on average has reduced for the customers in Q4 as compared to Q1. This is in line with what we saw for Total_Amt_Chng_Q4_Q1.
Total_Trans_Amt is the total transaction amount in the last 12 months, range is from 510 to 18484, this is a large range with mean value of 4404 and median of 3899. The mean and median are slightly close to each other, however the max value is farther away showing that there might be few outliers.
Total_Trans_Ct is the total transaction count in the last 12 months, the range is 10 transactions to 139 transactions; this is again a large range showing the varied usage among customers. The mean is about 65 transactions and the median is 67 transactions; there are a few outliers that we will study further and the distribution is not normal.
Avg_Utilization_Ratio, the range of this feature is 0-0.99. The mean of 0.27 is greater than the median of 0.18, which shows that this feature is not normally distributed and is right-skewed.
# While doing uni-variate analysis of numerical variables we want to study their central tendency
# and dispersion.
# Let us write a function that will help us create a boxplot and histogram for any input numerical
# variable.
# This function takes the numerical column as the input and returns the boxplots
# and histograms for the variable.
# Let us see if this helps us write faster and cleaner code.
def histogram_boxplot(feature, figsize=(15, 10), bins=None):
    """Draw a boxplot and a histogram of a numerical feature on a shared x-axis.

    Parameters
    ----------
    feature : 1-d feature array (e.g. a pandas Series)
    figsize : size of the figure (default (15, 10))
    bins : number of histogram bins (default None -> automatic binning)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # two stacked subplots: boxplot on top, histogram below
        sharex=True,  # x-axis is shared so the two views line up
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # Boxplot; showmeans=True adds a marker indicating the mean of the column
    sns.boxplot(feature, ax=ax_box2, showmeans=True, color="violet")
    # Histogram. Fixes two bugs in the original: `kde=F` referenced an
    # undefined name (NameError), and `palette` is not a distplot parameter
    # (it would raise TypeError when `bins` was supplied).
    if bins:
        sns.distplot(feature, kde=False, ax=ax_hist2, bins=bins)
    else:
        sns.distplot(feature, kde=False, ax=ax_hist2)
    # Add mean (dashed green) and median (solid black) reference lines
    ax_hist2.axvline(np.mean(feature), color="green", linestyle="--")
    ax_hist2.axvline(np.median(feature), color="black", linestyle="-")
# Univariate analysis of each numerical feature using the helper above:
# boxplot (with mean marker) stacked on a histogram (with mean/median lines).
# Observations on Customer_age (Customer Profile)
histogram_boxplot(data["Customer_Age"])
#Months_on_book feature
histogram_boxplot(data["Months_on_book"])
# Credit_Limit feature
histogram_boxplot(data["Credit_Limit"])
# Total_Revolving_Bal feature
histogram_boxplot(data["Total_Revolving_Bal"])
# Avg_Open_To_Buy feature
histogram_boxplot(data["Avg_Open_To_Buy"])
# Total_Amt_Chng_Q4_Q1 feature
histogram_boxplot(data["Total_Amt_Chng_Q4_Q1"])
# Total_Trans_Amt feature
histogram_boxplot(data["Total_Trans_Amt"])
# Total_Trans_Ct feature
histogram_boxplot(data["Total_Trans_Ct"])
# Total_Ct_Chng_Q4_Q1 feature
histogram_boxplot(data["Total_Ct_Chng_Q4_Q1"])
# Avg_Utilization_Ratio feature
histogram_boxplot(data["Avg_Utilization_Ratio"])
def perc_on_bar(feature):
    """
    Draw a countplot of a categorical feature and annotate every bar with
    the percentage of observations it represents.
    feature: categorical feature (a column/Series of `data`)
    the function won't work if a column is passed in the hue parameter
    """
    sns.set(rc={"figure.figsize": (10, 5)})
    # Bar chart of class frequencies for the feature
    axis = sns.countplot(x=feature, data=data,palette="rocket")
    n_rows = len(feature)  # total number of observations in the column
    for bar in axis.patches:
        # Share of this class, formatted to one decimal place
        share = "{:.1f}%".format(100 * bar.get_height() / n_rows)
        # Place the label just above the bar, slightly left of its centre
        label_x = bar.get_x() + bar.get_width() / 2 - 0.1
        label_y = bar.get_y() + bar.get_height()
        axis.annotate(share, (label_x, label_y), size=14)
    plt.show()  # render the annotated plot
# Univariate analysis of the categorical / discrete features: annotated
# countplots showing the percentage share of each class.
# Categorical feature Attrition_Flag
perc_on_bar(data["Attrition_Flag"])
# Categorical feature Gender
perc_on_bar(data["Gender"])
# Categorical feature Dependent_count
perc_on_bar(data["Dependent_count"])
# Categorical feature Education_Level
perc_on_bar(data["Education_Level"])
# Categorical feature Marital_Status
perc_on_bar(data["Marital_Status"])
# Categorical feature Income_Category
perc_on_bar(data["Income_Category"])
# Categorical feature Card_Category
perc_on_bar(data["Card_Category"])
# Categorical feature Total_Relationship_Count
perc_on_bar(data["Total_Relationship_Count"])
# Categorical feature Months_Inactive_12_mon
perc_on_bar(data["Months_Inactive_12_mon"])
# Categorical feature Contacts_Count_12_mon
perc_on_bar(data["Contacts_Count_12_mon"])
Let's try to compare features with respect to each other, we will try to focus on these relationships:
Attrition_Flag vs Months_on_book
Card_Category vs Credit_Limit
But first, let's pairplot the features
# Pairwise scatterplots of all numerical features, colored by churn status
sns.pairplot(data, hue="Attrition_Flag")
<seaborn.axisgrid.PairGrid at 0x7fb4791ee310>
# Correlation heatmap of the numerical features (values in [-1, 1])
plt.figure(figsize=(10,5))
sns.heatmap(data.corr(),annot=True,vmin=-1,vmax=1,fmt='.2f')
plt.show()
# Attrition_Flag vs Contacts_Count_12_mon
# Churn counts broken down by number of bank contacts in the last 12 months
print('Attrition_Flag vs Contacts_Count_12_mon')
print(' ')
print(data.groupby('Contacts_Count_12_mon')['Attrition_Flag'].value_counts(ascending=False))
sns.countplot(x=data['Attrition_Flag'],hue=data['Contacts_Count_12_mon']);#,palette="winter");
Attrition_Flag vs Contacts_Count_12_mon
Contacts_Count_12_mon Attrition_Flag
0 Existing Customer 392
Attrited Customer 7
1 Existing Customer 1391
Attrited Customer 108
2 Existing Customer 2824
Attrited Customer 403
3 Existing Customer 2699
Attrited Customer 681
4 Existing Customer 1077
Attrited Customer 315
5 Existing Customer 117
Attrited Customer 59
6 Attrited Customer 54
Name: Attrition_Flag, dtype: int64
# Attrition_Flag vs Months_Inactive_12_mon
# Churn counts broken down by months of inactivity in the last 12 months
print('Attrition_Flag vs Months_Inactive_12_mon')
print(' ')
print(data.groupby('Months_Inactive_12_mon')['Attrition_Flag'].value_counts(ascending=False))
sns.countplot(x=data['Attrition_Flag'],hue=data['Months_Inactive_12_mon']);#,palette="winter");
Attrition_Flag vs Months_Inactive_12_mon
Months_Inactive_12_mon Attrition_Flag
0 Attrited Customer 15
Existing Customer 14
1 Existing Customer 2133
Attrited Customer 100
2 Existing Customer 2777
Attrited Customer 505
3 Existing Customer 3020
Attrited Customer 826
4 Existing Customer 305
Attrited Customer 130
5 Existing Customer 146
Attrited Customer 32
6 Existing Customer 105
Attrited Customer 19
Name: Attrition_Flag, dtype: int64
# Attrition_Flag vs Total_Relationship_Count
# Churn counts broken down by number of bank products held
print('Attrition_Flag vs Total_Relationship_Count')
print(' ')
print(data.groupby('Total_Relationship_Count')['Attrition_Flag'].value_counts(ascending=False))
sns.countplot(x=data['Attrition_Flag'],hue=data['Total_Relationship_Count']);#,palette="winter");
Attrition_Flag vs Total_Relationship_Count
Total_Relationship_Count Attrition_Flag
1 Existing Customer 677
Attrited Customer 233
2 Existing Customer 897
Attrited Customer 346
3 Existing Customer 1905
Attrited Customer 400
4 Existing Customer 1687
Attrited Customer 225
5 Existing Customer 1664
Attrited Customer 227
6 Existing Customer 1670
Attrited Customer 196
Name: Attrition_Flag, dtype: int64
# Attrition_Flag vs Card_Category
# Churn counts broken down by credit-card tier
print('Attrition_Flag vs Card_Category')
print(' ')
print(data.groupby('Card_Category')['Attrition_Flag'].value_counts(ascending=False))
sns.countplot(x=data['Attrition_Flag'],hue=data['Card_Category']);#,palette="winter");
Attrition_Flag vs Card_Category
Card_Category Attrition_Flag
Blue Existing Customer 7917
Attrited Customer 1519
Gold Existing Customer 95
Attrited Customer 21
Platinum Existing Customer 15
Attrited Customer 5
Silver Existing Customer 473
Attrited Customer 82
Name: Attrition_Flag, dtype: int64
# Attrition_Flag vs Income_Category
# Churn counts broken down by annual income bracket
print('Attrition_Flag vs Income_Category')
print(' ')
print(data.groupby('Income_Category')['Attrition_Flag'].value_counts(ascending=False))
sns.countplot(x=data['Attrition_Flag'],hue=data['Income_Category']);#,palette="winter");
Attrition_Flag vs Income_Category
Income_Category Attrition_Flag
$120K + Existing Customer 601
Attrited Customer 126
$40K - $60K Existing Customer 1519
Attrited Customer 271
$60K - $80K Existing Customer 1213
Attrited Customer 189
$80K - $120K Existing Customer 1293
Attrited Customer 242
Less than $40K Existing Customer 2949
Attrited Customer 612
Unknown Existing Customer 925
Attrited Customer 187
Name: Attrition_Flag, dtype: int64
# Attrition_Flag vs Total_Revolving_Bal
# Distribution of revolving balance split by churn status
print('Attrition_Flag vs Total_Revolving_Bal')
print(' ')
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(y="Total_Revolving_Bal", x="Attrition_Flag", data=data, orient="vertical");
Attrition_Flag vs Total_Revolving_Bal
# Attrition_Flag vs Months_on_book
# Distribution of tenure with the bank split by churn status
print('Attrition_Flag vs Months_on_book')
print(' ')
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(y="Months_on_book", x="Attrition_Flag", data=data, orient="vertical");
Attrition_Flag vs Months_on_book
# Grid of boxplots: six numerical features split by churn status (3x2 layout)
cols = data[["Avg_Utilization_Ratio", "Total_Ct_Chng_Q4_Q1", "Total_Amt_Chng_Q4_Q1", "Total_Trans_Amt","Total_Trans_Ct","Avg_Open_To_Buy"]].columns.tolist()
plt.figure(figsize=(10, 10))
for i, variable in enumerate(cols):
    plt.subplot(3, 2, i + 1)
    # NOTE(review): positional (x, y) args to sns.boxplot rely on an older
    # seaborn API; newer versions require keyword arguments -- confirm version.
    sns.boxplot(data["Attrition_Flag"], data[variable])
    plt.tight_layout()
    plt.title(variable)
plt.show()
#Credit Card
# Card_Category vs Credit_Limit
# Distribution of credit limit per card tier
print('Card_Category vs Credit_Limit')
print(' ')
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(y="Credit_Limit", x="Card_Category", data=data, orient="vertical");
Card_Category vs Credit_Limit
# Card_Category vs Income_Category
# Card-tier counts broken down by annual income bracket
print('Card_Category vs Income_Category')
print(' ')
print(data.groupby('Income_Category')['Card_Category'].value_counts(ascending=False))
sns.countplot(x=data['Card_Category'],hue=data['Income_Category']);#,palette="winter");
Card_Category vs Income_Category
Income_Category Card_Category
$120K + Blue 645
Silver 60
Gold 18
Platinum 4
$40K - $60K Blue 1675
Silver 99
Gold 15
Platinum 1
$60K - $80K Blue 1273
Silver 96
Gold 29
Platinum 4
$80K - $120K Blue 1395
Silver 117
Gold 21
Platinum 2
Less than $40K Blue 3403
Silver 130
Gold 24
Platinum 4
Unknown Blue 1045
Silver 53
Gold 9
Platinum 5
Name: Card_Category, dtype: int64
# Grid of boxplots: five numerical features split by card tier (3x2 layout,
# last subplot slot left empty)
cols = data[["Total_Revolving_Bal", "Total_Trans_Ct", "Total_Trans_Amt", "Avg_Utilization_Ratio","Avg_Open_To_Buy"]].columns.tolist()
plt.figure(figsize=(10, 10))
for i, variable in enumerate(cols):
    plt.subplot(3, 2, i + 1)
    # NOTE(review): positional (x, y) args to sns.boxplot rely on an older
    # seaborn API; newer versions require keyword arguments -- confirm version.
    sns.boxplot(data["Card_Category"], data[variable])
    plt.tight_layout()
    plt.title(variable)
plt.show()
We have seen that a few categorical columns have "Unknown" values; these represent missing values in the dataset. We will treat these missing values with the KNN imputer. The features of interest that have "Unknown" values are:
A few continuous features that are ratios have a "0" value, which indicates that the numerator for those ratios was marked zero (null value). Since such a ratio cannot be zero, we will also treat these zeros as missing values and use the KNN imputer to treat them. The features of interest that have a "0" ratio value are:
# "Unknown" in the categorical columns is really a missing value.
for col in ("Education_Level", "Marital_Status", "Income_Category"):
    data[col] = data[col].replace({"Unknown": np.nan})
# A ratio of exactly 0 means the numerator was recorded as null, so the
# zeros in these ratio columns are treated as missing values as well.
for col in ("Total_Amt_Chng_Q4_Q1", "Total_Ct_Chng_Q4_Q1", "Avg_Utilization_Ratio"):
    data[col] = data[col].replace({0: np.nan})
# Count of missing values per column after the replacements
data.isnull().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 1112 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 5 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 7 Avg_Utilization_Ratio 2470 dtype: int64
We have to impute values for following categorical columns:
- Education_Level
- Marital_Status
- Income_Category
and following numerical columns:
- Total_Amt_Chng_Q4_Q1
- Total_Ct_Chng_Q4_Q1
- Avg_Utilization_Ratio
# Columns that contain missing values and will be KNN-imputed
reqd_col_for_impute = ['Marital_Status','Income_Category','Education_Level','Total_Amt_Chng_Q4_Q1','Total_Ct_Chng_Q4_Q1','Avg_Utilization_Ratio']
# we need to pass numerical values for each categorical column for KNN imputation so we will label encode them
# The encoding dicts are kept at module level so the codes can be inverted
# back to labels after imputation (see inverse_mapping below).
Marital_Status = {"Married": 1, "Single":2 , "Divorced": 3}
data['Marital_Status'] = data['Marital_Status'].map(Marital_Status)
# Income brackets encoded in increasing order of income
Income_Category = {"Less than $40K": 1, "$40K - $60K":2 , "$60K - $80K": 3, "$80K - $120K": 4,"$120K +":5}
data['Income_Category'] = data['Income_Category'].map(Income_Category)
# Education levels encoded in increasing order of attainment
Education_Level= {"Uneducated": 1 ,"High School": 2,"College": 3 ,"Graduate": 4,"Post-Graduate":5,"Doctorate":6}
data['Education_Level'] = data['Education_Level'].map(Education_Level)
# Inspect the first ten rows after label-encoding; NaNs mark values to impute
data.head(10)
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Existing Customer | 45 | M | 3 | 2.0 | 1.0 | 3.0 | Blue | 39 | 5 | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | Existing Customer | 49 | F | 5 | 4.0 | 2.0 | 1.0 | Blue | 44 | 6 | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | Existing Customer | 51 | M | 3 | 4.0 | 1.0 | 4.0 | Blue | 36 | 4 | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | NaN |
| 3 | Existing Customer | 40 | F | 4 | 2.0 | NaN | 1.0 | Blue | 34 | 3 | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | Existing Customer | 40 | M | 3 | 1.0 | 1.0 | 3.0 | Blue | 21 | 5 | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | NaN |
| 5 | Existing Customer | 44 | M | 2 | 4.0 | 1.0 | 2.0 | Blue | 36 | 3 | 1 | 2 | 4010.0 | 1247 | 2763.0 | 1.376 | 1088 | 24 | 0.846 | 0.311 |
| 6 | Existing Customer | 51 | M | 4 | NaN | 1.0 | 5.0 | Gold | 46 | 6 | 1 | 3 | 34516.0 | 2264 | 32252.0 | 1.975 | 1330 | 31 | 0.722 | 0.066 |
| 7 | Existing Customer | 32 | M | 0 | 2.0 | NaN | 3.0 | Silver | 27 | 2 | 2 | 2 | 29081.0 | 1396 | 27685.0 | 2.204 | 1538 | 36 | 0.714 | 0.048 |
| 8 | Existing Customer | 37 | M | 3 | 1.0 | 2.0 | 3.0 | Blue | 36 | 5 | 2 | 0 | 22352.0 | 2517 | 19835.0 | 3.355 | 1350 | 24 | 1.182 | 0.113 |
| 9 | Existing Customer | 48 | M | 2 | 4.0 | 2.0 | 4.0 | Blue | 36 | 6 | 3 | 3 | 11656.0 | 1677 | 9979.0 | 1.524 | 1441 | 32 | 0.882 | 0.144 |
# Cast the target column to the pandas categorical dtype
data['Attrition_Flag']=data['Attrition_Flag'].astype('category')
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null category 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null object 3 Dependent_count 10127 non-null int64 4 Education_Level 8608 non-null float64 5 Marital_Status 9378 non-null float64 6 Income_Category 9015 non-null float64 7 Card_Category 10127 non-null object 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null int64 10 Months_Inactive_12_mon 10127 non-null int64 11 Contacts_Count_12_mon 10127 non-null int64 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10122 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10120 non-null float64 19 Avg_Utilization_Ratio 7657 non-null float64 dtypes: category(1), float64(8), int64(9), object(2) memory usage: 1.5+ MB
# Separating target variable and other variables
X = data.drop(columns="Attrition_Flag",axis=1)
# Binary target: 1 = Attrited Customer (churned), 0 = Existing Customer
Y = data["Attrition_Flag"].apply(lambda x : 1 if x=='Attrited Customer' else 0)
# Splitting data into training and test set, since we will use k fold technique for training and testing,
#we will skip splitting the data into validation set.
# stratify=Y keeps the ~16% churn rate identical in both splits
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
(7088, 19) (3039, 19)
# Shapes of the target vectors (should match the row counts above)
print(Y_train.shape, Y_test.shape)
(7088,) (3039,)
# KNN imputation: each missing value is filled from its 5 nearest-neighbour rows
imputer = KNNImputer(n_neighbors=5)
#Fit and transform the train data
# The imputer is fit on the training set only and then applied to the test
# set, which avoids leaking test-set information into the training data.
X_train[reqd_col_for_impute]=imputer.fit_transform(X_train[reqd_col_for_impute])
#Transform the test data
X_test[reqd_col_for_impute]=imputer.transform(X_test[reqd_col_for_impute])
#Checking that no column has missing values in train or test sets
print(X_train.isna().sum())
print('-'*30)
print(X_test.isna().sum())
Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
## Function to inverse the encoding
def inverse_mapping(x,y):
    """Map the ordinal codes in column y back to their original string labels.

    x: the dict used for the forward encoding (label -> code)
    y: name of the column to decode
    NOTE(review): mutates the global X_train / X_test frames in place.
    KNN-imputed values are fractional, so they are rounded to the nearest
    code before the inverse lookup; a rounded value outside the dict's code
    range would map to NaN -- verify this cannot occur here.
    """
    inv_dict = {v: k for k, v in x.items()}
    X_train[y] = np.round(X_train[y]).map(inv_dict).astype('category')
    X_test[y] = np.round(X_test[y]).map(inv_dict).astype('category')
# Decode the three label-encoded columns back to readable categories
inverse_mapping(Marital_Status,'Marital_Status')
inverse_mapping(Income_Category,'Income_Category')
inverse_mapping(Education_Level,'Education_Level')
# Sanity-check the decoded categorical columns of the training set: every
# category should carry a readable label again.
categorical_train = X_train.select_dtypes(include=['object', 'category'])
for column in categorical_train.columns:
    print(X_train[column].value_counts())
    print('*' * 30)
F 3770 M 3318 Name: Gender, dtype: int64 ****************************** Graduate 2404 High School 1635 College 1331 Uneducated 1035 Post-Graduate 371 Doctorate 312 Name: Education_Level, dtype: int64 ****************************** Married 3479 Single 3105 Divorced 504 Name: Marital_Status, dtype: int64 ****************************** Less than $40K 2537 $40K - $60K 1612 $60K - $80K 1304 $80K - $120K 1132 $120K + 503 Name: Income_Category, dtype: int64 ****************************** Blue 6621 Silver 375 Gold 78 Platinum 14 Name: Card_Category, dtype: int64 ******************************
# Same sanity check for the test set's categorical columns.
categorical_test = X_test.select_dtypes(include=['object', 'category'])
for column in categorical_test.columns:
    print(X_test[column].value_counts())
    print('*' * 30)
F 1588 M 1451 Name: Gender, dtype: int64 ****************************** Graduate 1043 High School 683 College 562 Uneducated 457 Post-Graduate 155 Doctorate 139 Name: Education_Level, dtype: int64 ****************************** Married 1469 Single 1324 Divorced 246 Name: Marital_Status, dtype: int64 ****************************** Less than $40K 1093 $40K - $60K 687 $60K - $80K 557 $80K - $120K 478 $120K + 224 Name: Income_Category, dtype: int64 ****************************** Blue 2815 Silver 180 Gold 38 Platinum 6 Name: Card_Category, dtype: int64 ******************************
# One-hot encode the categorical columns, dropping the first level of each
# to avoid the dummy-variable trap.
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)
# get_dummies is applied to each split independently, so a category absent
# from one split would yield mismatched column sets. Re-align the test frame
# on the training columns (missing dummies filled with 0). With the current
# data both splits contain every category, so this is a no-op here — but it
# makes the encoding robust to a different split.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
print(X_train.shape, X_test.shape)
print(X_train.info())
print(X_test.info())
<class 'pandas.core.frame.DataFrame'> Int64Index: 7088 entries, 4124 to 4752 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 7088 non-null int64 1 Dependent_count 7088 non-null int64 2 Months_on_book 7088 non-null int64 3 Total_Relationship_Count 7088 non-null int64 4 Months_Inactive_12_mon 7088 non-null int64 5 Contacts_Count_12_mon 7088 non-null int64 6 Credit_Limit 7088 non-null float64 7 Total_Revolving_Bal 7088 non-null int64 8 Avg_Open_To_Buy 7088 non-null float64 9 Total_Amt_Chng_Q4_Q1 7088 non-null float64 10 Total_Trans_Amt 7088 non-null int64 11 Total_Trans_Ct 7088 non-null int64 12 Total_Ct_Chng_Q4_Q1 7088 non-null float64 13 Avg_Utilization_Ratio 7088 non-null float64 14 Gender_M 7088 non-null uint8 15 Education_Level_Doctorate 7088 non-null uint8 16 Education_Level_Graduate 7088 non-null uint8 17 Education_Level_High School 7088 non-null uint8 18 Education_Level_Post-Graduate 7088 non-null uint8 19 Education_Level_Uneducated 7088 non-null uint8 20 Marital_Status_Married 7088 non-null uint8 21 Marital_Status_Single 7088 non-null uint8 22 Income_Category_$40K - $60K 7088 non-null uint8 23 Income_Category_$60K - $80K 7088 non-null uint8 24 Income_Category_$80K - $120K 7088 non-null uint8 25 Income_Category_Less than $40K 7088 non-null uint8 26 Card_Category_Gold 7088 non-null uint8 27 Card_Category_Platinum 7088 non-null uint8 28 Card_Category_Silver 7088 non-null uint8 dtypes: float64(5), int64(9), uint8(15) memory usage: 934.5 KB None <class 'pandas.core.frame.DataFrame'> Int64Index: 3039 entries, 7403 to 8523 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 3039 non-null int64 1 Dependent_count 3039 non-null int64 2 Months_on_book 3039 non-null int64 3 Total_Relationship_Count 3039 non-null int64 4 Months_Inactive_12_mon 3039 non-null int64 5 Contacts_Count_12_mon 3039 non-null int64 6 Credit_Limit 3039 non-null float64 7 
Total_Revolving_Bal 3039 non-null int64 8 Avg_Open_To_Buy 3039 non-null float64 9 Total_Amt_Chng_Q4_Q1 3039 non-null float64 10 Total_Trans_Amt 3039 non-null int64 11 Total_Trans_Ct 3039 non-null int64 12 Total_Ct_Chng_Q4_Q1 3039 non-null float64 13 Avg_Utilization_Ratio 3039 non-null float64 14 Gender_M 3039 non-null uint8 15 Education_Level_Doctorate 3039 non-null uint8 16 Education_Level_Graduate 3039 non-null uint8 17 Education_Level_High School 3039 non-null uint8 18 Education_Level_Post-Graduate 3039 non-null uint8 19 Education_Level_Uneducated 3039 non-null uint8 20 Marital_Status_Married 3039 non-null uint8 21 Marital_Status_Single 3039 non-null uint8 22 Income_Category_$40K - $60K 3039 non-null uint8 23 Income_Category_$60K - $80K 3039 non-null uint8 24 Income_Category_$80K - $120K 3039 non-null uint8 25 Income_Category_Less than $40K 3039 non-null uint8 26 Card_Category_Gold 3039 non-null uint8 27 Card_Category_Platinum 3039 non-null uint8 28 Card_Category_Silver 3039 non-null uint8 dtypes: float64(5), int64(9), uint8(15) memory usage: 400.6 KB None
Predicting a customer will attrite when the customer does not attrite — loss of resources: the bank spends on improving services for a customer who would have stayed anyway.
Predicting a customer will not attrite when the customer does attrite — loss of clientele when the customer renounces their credit card.
## Function to calculate different metric scores of the model - Accuracy, Recall and Precision
def get_metrics_score(model,train,test,train_y,test_y,flag=True):
    '''
    Compute accuracy, recall and precision on the train and test splits.

    model            : fitted classifier exposing predict() and score()
    train, test      : feature matrices for the two splits
    train_y, test_y  : corresponding ground-truth labels
    flag             : when True (default), also print the six scores

    Returns [train_acc, test_acc, train_recall, test_recall,
             train_precision, test_precision].
    '''
    pred_train = model.predict(train)
    pred_test = model.predict(test)

    train_acc = model.score(train,train_y)
    test_acc = model.score(test,test_y)
    train_recall = metrics.recall_score(train_y,pred_train)
    test_recall = metrics.recall_score(test_y,pred_test)
    train_precision = metrics.precision_score(train_y,pred_train)
    test_precision = metrics.precision_score(test_y,pred_test)

    score_list = [train_acc, test_acc, train_recall, test_recall,
                  train_precision, test_precision]

    # Reuse the values computed above; the original recomputed every metric
    # (including a second predict pass via model.score) inside the prints.
    if flag == True:
        print("Accuracy on training set : ",train_acc)
        print("Accuracy on test set : ",test_acc)
        print("Recall on training set : ",train_recall)
        print("Recall on test set : ",test_recall)
        print("Precision on training set : ",train_precision)
        print("Precision on test set : ",test_precision)
    return score_list # returning the list with train and test scores
def make_confusion_matrix(model,y_actual,labels=[1, 0]):
    '''
    Plot the confusion matrix (counts and percentages) for `model`
    evaluated on the global X_test.

    model    : fitted classifier
    y_actual : ground-truth labels for X_test
    labels   : NOTE(review) — this parameter is never used; the matrix is
               always built with labels=[0, 1]. Kept only for interface
               compatibility with existing calls.
    '''
    y_predict = model.predict(X_test)
    cm = metrics.confusion_matrix(y_actual, y_predict, labels=[0, 1])
    df_cm = pd.DataFrame(
        cm,
        index=["Actual - No", "Actual - Yes"],
        columns=['Predicted - No', 'Predicted - Yes'],
    )
    # Annotate each cell with its raw count and its share of all samples.
    group_counts = ["{0:0.0f}".format(value) for value in cm.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in cm.flatten() / np.sum(cm)]
    # Distinct name for the cell annotations: the original reassigned the
    # `labels` parameter here, shadowing (and hiding) the argument.
    cell_labels = np.asarray(
        [f"{v1}\n{v2}" for v1, v2 in zip(group_counts, group_percentages)]
    ).reshape(2, 2)
    plt.figure(figsize=(10, 7))
    sns.heatmap(df_cm, annot=cell_labels, fmt='')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Baseline: plain logistic regression on the original (imbalanced) training data.
lr = LogisticRegression(random_state=1)
lr.fit(X_train, Y_train)
LogisticRegression(random_state=1)
Let's evaluate the model performance by using KFold and cross_val_score
K-Folds cross-validation provides dataset indices to split data into train/validation sets. The dataset is split into k consecutive stratified folds (without shuffling by default); each fold is then used once as validation while the k − 1 remaining folds form the training set. We evaluate with scoring='recall'.
# 5-fold stratified CV; recall is the metric of interest because missing a
# churner is the costlier error.
scoring = 'recall'  # was undefined at this point in the original cell (NameError)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(estimator=lr, X=X_train, y=Y_train, scoring=scoring, cv=kfold)
#Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
#Calculating different metrics
scores_LR = get_metrics_score(lr,X_train,X_test,Y_train,Y_test)
# creating confusion matrix
make_confusion_matrix(lr,Y_test)
Accuracy on training set : 0.8776805869074492 Accuracy on test set : 0.8798947022046726 Recall on training set : 0.43459174714661986 Recall on test set : 0.4385245901639344 Precision on training set : 0.6894150417827298 Precision on test set : 0.7016393442622951
#!pip install -U imbalanced-learn
Collecting imbalanced-learn
Downloading imbalanced_learn-0.8.0-py3-none-any.whl (206 kB)
|████████████████████████████████| 206 kB 714 kB/s eta 0:00:01
Requirement already satisfied: joblib>=0.11 in /Users/smathur7/anaconda3/lib/python3.8/site-packages (from imbalanced-learn) (1.0.1)
Requirement already satisfied: numpy>=1.13.3 in /Users/smathur7/anaconda3/lib/python3.8/site-packages (from imbalanced-learn) (1.20.1)
Requirement already satisfied: scikit-learn>=0.24 in /Users/smathur7/anaconda3/lib/python3.8/site-packages (from imbalanced-learn) (0.24.1)
Requirement already satisfied: scipy>=0.19.1 in /Users/smathur7/anaconda3/lib/python3.8/site-packages (from imbalanced-learn) (1.6.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/smathur7/anaconda3/lib/python3.8/site-packages (from scikit-learn>=0.24->imbalanced-learn) (2.1.0)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.8.0
from imblearn.over_sampling import SMOTE

# Class counts before oversampling.
print("Before UpSampling, counts of label 'Yes': {}".format(sum(Y_train==1)))
print("Before UpSampling, counts of label 'No': {} \n".format(sum(Y_train==0)))

# SMOTE synthesises minority-class points from their 5 nearest neighbours
# until both classes are equal in size (sampling_strategy=1).
sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)
X_train_over, Y_train_over = sm.fit_resample(X_train, Y_train)

print("After UpSampling, counts of label 'Yes': {}".format(sum(Y_train_over==1)))
print("After UpSampling, counts of label 'No': {} \n".format(sum(Y_train_over==0)))
print('After UpSampling, the shape of train_X: {}'.format(X_train_over.shape))
print('After UpSampling, the shape of train_y: {} \n'.format(Y_train_over.shape))
Before UpSampling, counts of label 'Yes': 1139 Before UpSampling, counts of label 'No': 5949 After UpSampling, counts of label 'Yes': 5949 After UpSampling, counts of label 'No': 5949 After UpSampling, the shape of train_X: (11898, 29) After UpSampling, the shape of train_y: (11898,)
# Logistic regression refitted on the SMOTE-balanced training data.
log_reg_over = LogisticRegression(random_state=1)
log_reg_over.fit(X_train_over, Y_train_over)
LogisticRegression(random_state=1)
Let's evaluate the model performance by using KFold and cross_val_score
K-Folds cross-validation provides dataset indices to split data into train/validation sets. The dataset is split into k consecutive stratified folds (without shuffling by default); each fold is then used once as validation while the k − 1 remaining folds form the training set. We evaluate with scoring='recall'.
# 5-fold stratified CV on the oversampled training data, scored on recall.
scoring = 'recall'  # was undefined at this point in the original cell (NameError)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)  # Setting number of splits equal to 5
cv_result_over = cross_val_score(estimator=log_reg_over, X=X_train_over, y=Y_train_over, scoring=scoring, cv=kfold)
#Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_over)
plt.show()
#Calculating different metrics
get_metrics_score(log_reg_over,X_train_over,X_test,Y_train_over,Y_test)
# creating confusion matrix
make_confusion_matrix(log_reg_over,Y_test)
Accuracy on training set : 0.8222390317700454 Accuracy on test set : 0.8002632444883185 Recall on training set : 0.8147587829887376 Recall on test set : 0.7725409836065574 Precision on training set : 0.8271331058020478 Precision on test set : 0.43184421534937
Lets try:
a) Regularization to see if we can improve the performance further.
b) Undersampling the train to handle the imbalance between classes and check the model performance.
# Regularised logistic regression: grid-search the inverse regularisation
# strength C over 0.1 .. 1.0 on the oversampled data, optimising recall.
lr_estimator = LogisticRegression(random_state=1, solver='saga')
parameters = {'C': np.arange(0.1, 1.1, 0.1)}
grid_obj = GridSearchCV(lr_estimator, parameters, scoring='recall')
grid_obj = grid_obj.fit(X_train_over, Y_train_over)
# Keep the best model found and refit it on the full oversampled train set.
lr_estimator = grid_obj.best_estimator_
lr_estimator.fit(X_train_over, Y_train_over)
LogisticRegression(C=0.1, random_state=1, solver='saga')
# Score the regularised model and visualise its confusion matrix.
get_metrics_score(lr_estimator, X_train_over, X_test, Y_train_over, Y_test)
make_confusion_matrix(lr_estimator, Y_test)
Accuracy on training set : 0.7100353000504287 Accuracy on test set : 0.7857847976307996 Recall on training set : 0.578416540595058 Recall on test set : 0.5245901639344263 Precision on training set : 0.78507871321013 Precision on test set : 0.37925925925925924
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop majority-class rows until both classes are equal in size.
rus = RandomUnderSampler(random_state=1)
X_train_un, Y_train_un = rus.fit_resample(X_train, Y_train)

print("Before Under Sampling, counts of label 'Yes': {}".format(sum(Y_train==1)))
print("Before Under Sampling, counts of label 'No': {} \n".format(sum(Y_train==0)))
print("After Under Sampling, counts of label 'Yes': {}".format(sum(Y_train_un==1)))
print("After Under Sampling, counts of label 'No': {} \n".format(sum(Y_train_un==0)))
print('After Under Sampling, the shape of train_X: {}'.format(X_train_un.shape))
print('After Under Sampling, the shape of train_y: {} \n'.format(Y_train_un.shape))
Before Under Sampling, counts of label 'Yes': 1139 Before Under Sampling, counts of label 'No': 5949 After Under Sampling, counts of label 'Yes': 1139 After Under Sampling, counts of label 'No': 1139 After Under Sampling, the shape of train_X: (2278, 29) After Under Sampling, the shape of train_y: (2278,)
# Logistic regression refitted on the undersampled training data.
log_reg_under = LogisticRegression(random_state=1)
log_reg_under.fit(X_train_un, Y_train_un)
LogisticRegression(random_state=1)
Let's evaluate the model performance by using KFold and cross_val_score
K-Folds cross-validation provides dataset indices to split data into train/validation sets. The dataset is split into k consecutive stratified folds (without shuffling by default); each fold is then used once as validation while the k − 1 remaining folds form the training set. We evaluate with scoring='recall'.
# 5-fold stratified CV on the undersampled training data, scored on recall.
scoring = 'recall'  # was undefined at this point in the original cell (NameError)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)  # Setting number of splits equal to 5
cv_result_under = cross_val_score(estimator=log_reg_under, X=X_train_un, y=Y_train_un, scoring=scoring, cv=kfold)
#Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_under)
plt.show()
#Calculating different metrics
get_metrics_score(log_reg_under,X_train_un,X_test,Y_train_un,Y_test)
# creating confusion matrix
make_confusion_matrix(log_reg_under,Y_test)
Accuracy on training set : 0.7949956101843723 Accuracy on test set : 0.7736097400460678 Recall on training set : 0.7945566286215979 Recall on test set : 0.7745901639344263 Precision on training set : 0.7952548330404218 Precision on test set : 0.39539748953974896
# Metric accumulators for the model-comparison table below: six parallel
# lists, one entry per model.
acc_train, acc_test = [], []
recall_train, recall_test = [], []
precision_train, precision_test = [], []

# Baseline logistic regression, evaluated on the original (imbalanced) split.
models = [lr]
for model in models:
    scores = get_metrics_score(model, X_train, X_test, Y_train, Y_test, False)
    acc_train.append(scores[0])
    acc_test.append(scores[1])
    recall_train.append(scores[2])
    recall_test.append(scores[3])
    precision_train.append(scores[4])
    precision_test.append(scores[5])
# Oversampled-data models (plain and regularised logistic regression), both
# trained on the SMOTE data but evaluated against the untouched test set.
models = [log_reg_over, lr_estimator]
for model in models:
    scores = get_metrics_score(model, X_train_over, X_test, Y_train_over, Y_test, False)
    acc_train.append(scores[0])
    acc_test.append(scores[1])
    recall_train.append(scores[2])
    recall_test.append(scores[3])
    precision_train.append(scores[4])
    precision_test.append(scores[5])
# Undersampled-data model, evaluated against the untouched test set.
models = [log_reg_under]
for model in models:
    scores = get_metrics_score(model, X_train_un, X_test, Y_train_un, Y_test, False)
    acc_train.append(scores[0])
    acc_test.append(scores[1])
    recall_train.append(scores[2])
    recall_test.append(scores[3])
    precision_train.append(scores[4])
    precision_test.append(scores[5])
# Assemble the per-model metrics into one comparison table.
comparison_frame = pd.DataFrame({
    'Model': ['Logistic Regression', 'Logistic Regression on Oversampled data',
              'Logistic Regression-Regularized (Oversampled data)',
              'Logistic Regression on Undersampled data'],
    'Train_Accuracy': acc_train, 'Test_Accuracy': acc_test,
    'Train_Recall': recall_train, 'Test_Recall': recall_test,
    'Train_Precision': precision_train, 'Test_Precision': precision_test,
})
# Sorting models in decreasing order of test recall — the original comment
# promised this sort but never applied it.
comparison_frame = comparison_frame.sort_values(by='Test_Recall', ascending=False)
comparison_frame
| Model | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | |
|---|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.877681 | 0.879895 | 0.434592 | 0.438525 | 0.689415 | 0.701639 |
| 1 | Logistic Regression on Oversampled data | 0.822239 | 0.800263 | 0.814759 | 0.772541 | 0.827133 | 0.431844 |
| 2 | Logistic Regression-Regularized (Oversampled d... | 0.710035 | 0.785785 | 0.578417 | 0.524590 | 0.785079 | 0.379259 |
| 3 | Logistic Regression on Undersampled data | 0.794996 | 0.773610 | 0.794557 | 0.774590 | 0.795255 | 0.395397 |
# Raw log-odds coefficients of the undersampled-data model, one per feature.
log_odds = log_reg_under.coef_[0]
pd.DataFrame(log_odds, X_train_un.columns, columns=['coef']).T
| Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | ... | Education_Level_Uneducated | Marital_Status_Married | Marital_Status_Single | Income_Category_$40K - $60K | Income_Category_$60K - $80K | Income_Category_$80K - $120K | Income_Category_Less than $40K | Card_Category_Gold | Card_Category_Platinum | Card_Category_Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| coef | 0.128343 | 0.153036 | -0.076064 | -0.077487 | 0.164096 | 0.239661 | -0.000287 | -0.000568 | 0.000281 | 0.003731 | ... | 0.004185 | -0.012732 | 0.039185 | 0.013153 | -0.003233 | -0.004856 | 0.030207 | 0.00231 | 0.001115 | 0.001544 |
1 rows × 29 columns
The coefficients of Customer_Age, Dependent_count, Months_Inactive_12_mon, Contacts_Count_12_mon, Avg_Open_To_Buy, Total_Amt_Chng_Q4_Q1, Education_Level_Uneducated, Marital_Status_Single, Income_Category $40K–$60K, Income_Category Less than $40K, Card_Category_Gold, Card_Category_Platinum and Card_Category_Silver are positive: an increase in these leads to an increase in the chances of a customer attriting.
The coefficients of Months_on_book, Total_Relationship_Count, Credit_Limit, Total_Revolving_Bal, Marital_Status_Married, Income_Category $60K–$80K and Income_Category $80K–$120K are negative: an increase in these leads to a decrease in the chances of a customer attriting.
# Convert log-odds to odds ratios: exp(coef) is the multiplicative change in
# the odds of attrition for a one-unit increase in the feature.
odds = np.exp(log_reg_under.coef_[0])
pd.set_option('display.max_columns', None)  # show every column of the wide frame
pd.DataFrame(odds, X_train.columns, columns=['odds']).T
| Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Gender_M | Education_Level_Doctorate | Education_Level_Graduate | Education_Level_High School | Education_Level_Post-Graduate | Education_Level_Uneducated | Marital_Status_Married | Marital_Status_Single | Income_Category_$40K - $60K | Income_Category_$60K - $80K | Income_Category_$80K - $120K | Income_Category_Less than $40K | Card_Category_Gold | Card_Category_Platinum | Card_Category_Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| odds | 1.136943 | 1.165367 | 0.926757 | 0.925439 | 1.178328 | 1.270819 | 0.999713 | 0.999432 | 1.000281 | 1.003738 | 1.00037 | 0.915428 | 0.974503 | 1.023922 | 0.982737 | 1.004821 | 1.004111 | 0.999181 | 1.008766 | 1.004194 | 0.987349 | 1.039963 | 1.01324 | 0.996772 | 0.995156 | 1.030667 | 1.002313 | 1.001115 | 1.001545 |
# Express each odds ratio as a percentage change: (exp(coef) - 1) * 100.
perc_change_odds = (np.exp(log_reg_under.coef_[0]) - 1) * 100
pd.set_option('display.max_columns', None)  # show every column of the wide frame
pd.DataFrame(perc_change_odds, X_train.columns, columns=['change_odds%']).T
| Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Gender_M | Education_Level_Doctorate | Education_Level_Graduate | Education_Level_High School | Education_Level_Post-Graduate | Education_Level_Uneducated | Marital_Status_Married | Marital_Status_Single | Income_Category_$40K - $60K | Income_Category_$60K - $80K | Income_Category_$80K - $120K | Income_Category_Less than $40K | Card_Category_Gold | Card_Category_Platinum | Card_Category_Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| change_odds% | 13.694336 | 16.53671 | -7.324323 | -7.456087 | 17.832777 | 27.081875 | -0.028721 | -0.056792 | 0.028087 | 0.373774 | 0.036964 | -8.457175 | -2.549737 | 2.392244 | -1.72629 | 0.48211 | 0.411143 | -0.08185 | 0.876601 | 0.419364 | -1.265143 | 3.996285 | 1.323997 | -0.322812 | -0.484412 | 3.066739 | 0.231279 | 0.111515 | 0.154509 |
Customer_Age: Holding all other features constant, a 1-unit change in Customer_Age will increase the odds of a customer attriting by 13.69%.
Dependent_count: Holding all other features constant, a 1-unit change in Dependent_count will increase the odds of a customer attriting by 16.5%.
The top four features that positively affect attrition are
The top three features that negatively affect attrition are
We will use Pipeline to build these models.
# Candidate models for comparison, each wrapped in a pipeline that
# standardises the features before fitting.
models = []  # list of (short name, pipeline) pairs

_candidates = [
    ("DTREE", "decision_tree", DecisionTreeClassifier(random_state=1)),
    ("RF", "random_forest", RandomForestClassifier(random_state=1)),
    ("GBM", "gradient_boosting", GradientBoostingClassifier(random_state=1)),
    ("ADB", "adaboost", AdaBoostClassifier(random_state=1)),
    ("XGB", "xgboost", XGBClassifier(random_state=1, eval_metric='logloss')),
]
for short_name, step_name, estimator in _candidates:
    models.append(
        (
            short_name,
            Pipeline(
                steps=[
                    ("scaler", StandardScaler()),
                    (step_name, estimator),
                ]
            ),
        )
    )
results = []  # CV score arrays, one per model
names = []    # model short names, in the same order

# 5-fold stratified CV for every candidate pipeline, scored on recall.
# The fold definition is invariant, so it is hoisted out of the loop.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for name, model in models:
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=Y_train, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
DTREE: 78.92727413246774 RF: 77.87386969626709 GBM: 83.40598191514029 ADB: 83.40714120102018 XGB: 86.82935311847902
# Plotting boxplots for CV scores of all models defined above
# One box per candidate model (labelled with its short name); the recall
# distributions come from the 5-fold CV loop above.
fig = plt.figure(figsize=(10, 7))
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
## Function to calculate different metric scores of the model - Accuracy, Recall and Precision
def get_metrics_score_tuned(model, flag=True):
    """
    Compute accuracy, recall and precision on the global train/test splits
    (X_train/Y_train and X_test/Y_test).

    model : fitted classifier exposing predict() and score()
    flag  : when True (default), also print the six scores

    Returns [train_acc, test_acc, train_recall, test_recall,
             train_precision, test_precision].
    """
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    train_acc = model.score(X_train, Y_train)
    test_acc = model.score(X_test, Y_test)
    train_recall = metrics.recall_score(Y_train, pred_train)
    test_recall = metrics.recall_score(Y_test, pred_test)
    train_precision = metrics.precision_score(Y_train, pred_train)
    test_precision = metrics.precision_score(Y_test, pred_test)

    score_list = [
        train_acc,
        test_acc,
        train_recall,
        test_recall,
        train_precision,
        test_precision,
    ]

    # Reuse the values computed above; the original recomputed every metric
    # (including a second predict pass via model.score) inside the prints.
    if flag == True:
        print("Accuracy on training set : ", train_acc)
        print("Accuracy on test set : ", test_acc)
        print("Recall on training set : ", train_recall)
        print("Recall on test set : ", test_recall)
        print(
            "Precision on training set : ", train_precision
        )
        print("Precision on test set : ", test_precision)
    return score_list  # returning the list with train and test scores
%%time
#Creating pipeline
pipe=make_pipeline(StandardScaler(), XGBClassifier(random_state=1,eval_metric='logloss'))
#Parameter grid to pass in GridSearchCV
param_grid={'xgbclassifier__n_estimators':np.arange(50,300,50),'xgbclassifier__scale_pos_weight':[0,1,2,5,10],
'xgbclassifier__learning_rate':[0.01,0.1,0.2,0.05], 'xgbclassifier__gamma':[0,1,3,5],
'xgbclassifier__subsample':[0.7,0.8,0.9,1]}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5, n_jobs = -1)
#Fitting parameters in GridSeachCV
grid_cv.fit(X_train,Y_train)
print("Best parameters are {} with CV score={}:" .format(grid_cv.best_params_,grid_cv.best_score_))
Best parameters are {'xgbclassifier__gamma': 5, 'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__n_estimators': 250, 'xgbclassifier__scale_pos_weight': 10, 'xgbclassifier__subsample': 0.7} with CV score=0.9508385501197928:
CPU times: user 34.4 s, sys: 3.05 s, total: 37.5 s
Wall time: 1h 8min 28s
# Refit XGBoost with the best grid-search parameters inside a fresh pipeline.
xgb_tuned1 = make_pipeline(
    StandardScaler(),
    XGBClassifier(
        random_state=1,
        eval_metric='logloss',
        n_estimators=250,
        learning_rate=0.1,
        gamma=5,
        subsample=0.7,
        scale_pos_weight=10,
    ),
)
xgb_tuned1.fit(X_train, Y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
('xgbclassifier',
XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=1, colsample_bynode=1,
colsample_bytree=1, eval_metric='logloss',
gamma=5, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.1,
max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=250,
n_jobs=12, num_parallel_tree=1, random_state=1,
reg_alpha=0, reg_lambda=1, scale_pos_weight=10,
subsample=0.7, tree_method='exact',
validate_parameters=1, verbosity=None))])
# Score the tuned XGBoost model and visualise its confusion matrix.
get_metrics_score_tuned(xgb_tuned1)
make_confusion_matrix(xgb_tuned1, Y_test)
Accuracy on training set : 0.9887133182844243 Accuracy on test set : 0.9697268838433696 Recall on training set : 1.0 Recall on test set : 0.9610655737704918 Precision on training set : 0.9343724364232978 Precision on test set : 0.8653136531365314
%%time
# Creating pipeline
pipe = make_pipeline(StandardScaler(), AdaBoostClassifier(random_state=1))
# Parameter grid to pass in GridSearchCV
param_grid = {
"adaboostclassifier__n_estimators": np.arange(10, 110, 10),
"adaboostclassifier__learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
"adaboostclassifier__base_estimator": [
DecisionTreeClassifier(max_depth=1, random_state=1),
DecisionTreeClassifier(max_depth=2, random_state=1),
DecisionTreeClassifier(max_depth=3, random_state=1),
],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5, n_jobs = -1)
# Fitting parameters in GridSeachCV
grid_cv.fit(X_train, Y_train)
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
Best Parameters:{'adaboostclassifier__base_estimator': DecisionTreeClassifier(max_depth=2, random_state=1), 'adaboostclassifier__learning_rate': 1, 'adaboostclassifier__n_estimators': 60}
Score: 0.8709444315634902
CPU times: user 2.86 s, sys: 321 ms, total: 3.18 s
Wall time: 1min 13s
# Refit AdaBoost with the best grid-search parameters inside a fresh pipeline.
abc_tuned1 = make_pipeline(
    StandardScaler(),
    AdaBoostClassifier(
        random_state=1,
        base_estimator=DecisionTreeClassifier(max_depth=2, random_state=1),
        n_estimators=60,
        learning_rate=1,
    ),
)
abc_tuned1.fit(X_train, Y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
('adaboostclassifier',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
random_state=1),
learning_rate=1, n_estimators=60,
random_state=1))])
# Score the tuned AdaBoost model and visualise its confusion matrix.
get_metrics_score_tuned(abc_tuned1)
make_confusion_matrix(abc_tuned1, Y_test)
Accuracy on training set : 0.9861738148984198 Accuracy on test set : 0.9726883843369529 Recall on training set : 0.9543459174714662 Recall on test set : 0.8975409836065574 Precision on training set : 0.9593998234774934 Precision on test set : 0.9299363057324841
%%time
# Creating pipeline
pipe = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=1))
# Parameter grid to pass in GridSearchCV
param_grid = {
"gradientboostingclassifier__n_estimators": np.arange(10, 110, 10),
"gradientboostingclassifier__learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
"gradientboostingclassifier__max_depth":[3,5,8],
"gradientboostingclassifier__criterion": ["friedman_mse", "mae"],
"gradientboostingclassifier__n_estimators":[10]
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scorer, cv=5, n_jobs = -1)
# Fitting parameters in GridSeachCV
grid_cv.fit(X_train, Y_train)
print(
"Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
Best Parameters:{'gradientboostingclassifier__criterion': 'friedman_mse', 'gradientboostingclassifier__learning_rate': 1, 'gradientboostingclassifier__max_depth': 5, 'gradientboostingclassifier__n_estimators': 10}
Score: 0.8182664811809259
CPU times: user 669 ms, sys: 151 ms, total: 820 ms
Wall time: 7min 49s
# Refit gradient boosting with the best grid-search parameters.
gbc_tuned1 = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(
        random_state=1,
        criterion='friedman_mse',
        n_estimators=10,
        learning_rate=1,
    ),
)
gbc_tuned1.fit(X_train, Y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
('gradientboostingclassifier',
GradientBoostingClassifier(learning_rate=1, n_estimators=10,
random_state=1))])
# Score the tuned gradient boosting model and visualise its confusion matrix.
get_metrics_score_tuned(gbc_tuned1)
make_confusion_matrix(gbc_tuned1, Y_test)
Accuracy on training set : 0.9626128668171557 Accuracy on test set : 0.9509707140506746 Recall on training set : 0.8568920105355575 Recall on test set : 0.8401639344262295 Precision on training set : 0.9053803339517625 Precision on test set : 0.8523908523908524
%%time
#Creating pipeline
pipe=make_pipeline(StandardScaler(),XGBClassifier(random_state=1,eval_metric='logloss', n_estimators = 50))
#Parameter grid to pass in RandomizedSearchCV
param_grid={'xgbclassifier__n_estimators':np.arange(50,300,50),
'xgbclassifier__scale_pos_weight':[0,1,2,5,10],
'xgbclassifier__learning_rate':[0.01,0.1,0.2,0.05],
'xgbclassifier__gamma':[0,1,3,5],
'xgbclassifier__subsample':[0.7,0.8,0.9,1],
'xgbclassifier__max_depth':np.arange(1,10,1),
'xgbclassifier__reg_lambda':[0,1,2,5,10]}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,Y_train)
print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
Best parameters are {'xgbclassifier__subsample': 0.8, 'xgbclassifier__scale_pos_weight': 10, 'xgbclassifier__reg_lambda': 10, 'xgbclassifier__n_estimators': 100, 'xgbclassifier__max_depth': 2, 'xgbclassifier__learning_rate': 0.2, 'xgbclassifier__gamma': 5} with CV score=0.9648929592704227:
CPU times: user 19min 5s, sys: 31.7 s, total: 19min 36s
Wall time: 1min 46s
# Rebuild the XGBoost pipeline with the best parameters reported by the
# randomized search above, then fit it on the full training set.
best_xgb = XGBClassifier(
    random_state=1,
    n_estimators=100,
    scale_pos_weight=10,
    gamma=5,
    subsample=0.8,
    learning_rate=0.2,
    eval_metric='logloss',
    max_depth=2,
    reg_lambda=10,
)
# Step names kept as "scaler"/"XGB" to match the original pipeline layout.
xgb_tuned2 = Pipeline(steps=[("scaler", StandardScaler()), ("XGB", best_xgb)])

# Fit the model on training data
xgb_tuned2.fit(X_train, Y_train)
Pipeline(steps=[('scaler', StandardScaler()),
('XGB',
XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=1, colsample_bynode=1,
colsample_bytree=1, eval_metric='logloss',
gamma=5, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.2,
max_delta_step=0, max_depth=2,
min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100,
n_jobs=12, num_parallel_tree=1, random_state=1,
reg_alpha=0, reg_lambda=10, scale_pos_weight=10,
subsample=0.8, tree_method='exact',
validate_parameters=1, verbosity=None))])
# Calculating different metrics (prints train/test accuracy, recall, precision
# for the tuned XGBoost model; helper defined earlier in the notebook)
get_metrics_score_tuned(xgb_tuned2)
# Creating confusion matrix on the test set
make_confusion_matrix(xgb_tuned2, Y_test)
Accuracy on training set : 0.9351015801354402 Accuracy on test set : 0.9259624876604146 Recall on training set : 0.9885864793678666 Recall on test set : 0.9692622950819673 Precision on training set : 0.715829624920534 Precision on test set : 0.6925329428989752
%%time
# Creating pipeline
pipe = make_pipeline(StandardScaler(), AdaBoostClassifier(random_state=1))
# Parameter grid to pass in RandomizedSearchCV
param_grid = {
"adaboostclassifier__n_estimators": np.arange(10, 110, 10),
"adaboostclassifier__learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
"adaboostclassifier__base_estimator": [
DecisionTreeClassifier(max_depth=1, random_state=1),
DecisionTreeClassifier(max_depth=2, random_state=1),
DecisionTreeClassifier(max_depth=3, random_state=1),
],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
abc_tuned2 = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
abc_tuned2.fit(X_train,Y_train)
print("Best parameters are {} with CV score={}:" .format(abc_tuned2.best_params_,abc_tuned2.best_score_))
Best parameters are {'adaboostclassifier__n_estimators': 90, 'adaboostclassifier__learning_rate': 1, 'adaboostclassifier__base_estimator': DecisionTreeClassifier(max_depth=2, random_state=1)} with CV score=0.8656812736687535:
CPU times: user 2min 10s, sys: 148 ms, total: 2min 10s
Wall time: 2min 10s
# Rebuild the AdaBoost pipeline with the best parameters reported by the
# randomized search above, then fit it on the full training set.
ada_params = dict(
    base_estimator=DecisionTreeClassifier(max_depth=2, random_state=1),
    n_estimators=90,
    learning_rate=1,
    random_state=1,
)
abc_tuned2 = make_pipeline(StandardScaler(), AdaBoostClassifier(**ada_params))

# Fit the model on training data
abc_tuned2.fit(X_train, Y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
('adaboostclassifier',
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
random_state=1),
learning_rate=1, n_estimators=90,
random_state=1))])
# Calculating different metrics (prints train/test accuracy, recall, precision
# for the tuned AdaBoost model; helper defined earlier in the notebook)
get_metrics_score_tuned(abc_tuned2)
# Creating confusion matrix on the test set
make_confusion_matrix(abc_tuned2, Y_test)
Accuracy on training set : 0.9946388261851016 Accuracy on test set : 0.9743336623889437 Recall on training set : 0.9850746268656716 Recall on test set : 0.9098360655737705 Precision on training set : 0.9816272965879265 Precision on test set : 0.9288702928870293
%%time
# Creating pipeline
pipe = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=1))
# Parameter grid to pass in RandomizedSearchCV
param_grid = {
"gradientboostingclassifier__n_estimators": np.arange(10, 110, 10),
"gradientboostingclassifier__learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
gbc_tuned2 = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
gbc_tuned2.fit(X_train,Y_train)
print("Best parameters are {} with CV score={}:" .format(gbc_tuned2.best_params_,gbc_tuned2.best_score_))
Best parameters are {'gradientboostingclassifier__n_estimators': 100, 'gradientboostingclassifier__learning_rate': 0.2} with CV score=0.8612991730427388:
CPU times: user 2min 36s, sys: 120 ms, total: 2min 36s
Wall time: 2min 36s
# Rebuild the Gradient Boosting pipeline with the best parameters reported by
# the randomized search above, then fit it on the full training set.
gbc_params = dict(
    n_estimators=100,
    learning_rate=0.2,
    random_state=1,
)
gbc_tuned2 = make_pipeline(StandardScaler(), GradientBoostingClassifier(**gbc_params))

# Fit the model on training data
gbc_tuned2.fit(X_train, Y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
('gradientboostingclassifier',
GradientBoostingClassifier(learning_rate=0.2,
random_state=1))])
# Calculating different metrics (prints train/test accuracy, recall, precision
# for the tuned gradient boosting model; helper defined earlier in the notebook)
get_metrics_score_tuned(gbc_tuned2)
# Creating confusion matrix on the test set
make_confusion_matrix(gbc_tuned2, Y_test)
Accuracy on training set : 0.9863148984198645 Accuracy on test set : 0.9730174399473511 Recall on training set : 0.9438103599648815 Recall on test set : 0.8975409836065574 Precision on training set : 0.970216606498195 Precision on test set : 0.9319148936170213
# Models to compare — each tuned above via grid or randomized search.
models = [abc_tuned1, abc_tuned2, gbc_tuned1, gbc_tuned2, xgb_tuned1, xgb_tuned2]

# get_metrics_score_tuned(model, False) returns six scores per model:
# (train_acc, test_acc, train_recall, test_recall, train_prec, test_prec).
# Collect one row per model, then transpose into the six metric columns.
rows = [list(get_metrics_score_tuned(model, False))[:6] for model in models]
(acc_train, acc_test,
 recall_train, recall_test,
 precision_train, precision_test) = (list(col) for col in zip(*rows))

comparison_frame = pd.DataFrame(
    {
        "Model": [
            "Adaboost with GridSearchCV",
            "Adaboost with RandomizedSearchCV",
            "GradientBoost with GridSearchCV",
            "GradientBoost with RandomizedSearchCV",
            "XGBoost with GridSearchCV",
            "XGBoost with RandomizedSearchCV",
        ],
        "Train_Accuracy": acc_train,
        "Test_Accuracy": acc_test,
        "Train_Recall": recall_train,
        "Test_Recall": recall_test,
        "Train_Precision": precision_train,
        "Test_Precision": precision_test,
    }
)

# Sorting models in decreasing order of test recall
comparison_frame.sort_values(by="Test_Recall", ascending=False)
| Model | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | |
|---|---|---|---|---|---|---|---|
| 5 | XGBoost with RandomizedSearchCV | 0.935102 | 0.925962 | 0.988586 | 0.969262 | 0.715830 | 0.692533 |
| 4 | XGBoost with GridSearchCV | 0.988713 | 0.969727 | 1.000000 | 0.961066 | 0.934372 | 0.865314 |
| 1 | Adaboost with RandomizedSearchCV | 0.994639 | 0.974334 | 0.985075 | 0.909836 | 0.981627 | 0.928870 |
| 0 | Adaboost with GridSearchCV | 0.986174 | 0.972688 | 0.954346 | 0.897541 | 0.959400 | 0.929936 |
| 3 | GradientBoost with RandomizedSearchCV | 0.986315 | 0.973017 | 0.943810 | 0.897541 | 0.970217 | 0.931915 |
| 2 | GradientBoost with GridSearchCV | 0.962613 | 0.950971 | 0.856892 | 0.840164 | 0.905380 | 0.852391 |
# Horizontal bar chart of feature importances for the grid-searched XGBoost
# model (least important at the bottom, most important at the top).
feature_names = X_train.columns
importances = xgb_tuned1[1].feature_importances_
order = np.argsort(importances)
positions = range(len(order))
plt.figure(figsize=(12, 12))
plt.title("Feature Importances XGB (Grid Search)")
plt.barh(positions, importances[order], color="violet", align="center")
plt.yticks(positions, feature_names[order])
plt.xlabel("Relative Importance")
plt.show()
# Horizontal bar chart of feature importances for the randomized-search
# XGBoost model (least important at the bottom, most important at the top).
feature_names = X_train.columns
importances = xgb_tuned2[1].feature_importances_
order = np.argsort(importances)
positions = range(len(order))
plt.figure(figsize=(12, 12))
plt.title("Feature Importances XGB (Random Search)")
plt.barh(positions, importances[order], color="violet", align="center")
plt.yticks(positions, feature_names[order])
plt.xlabel("Relative Importance")
plt.show()
Bank should target customers who have low transaction count or low transaction amount in last 12 months, bank can reach out and do proactive surveys as well as try to see if they can offer customers loyalty points/discounts per transactions so that there is incentive for customers to use the Credit Card.
Bank should target customers who have a lower revolving balance, indicating that customers are either not using the credit card frequently or using it only for small-amount transactions. Again, the bank can offer loyalty points for high-amount transactions. The bank can also look at giving customers an incentive to transfer the balance of another credit card (possibly from another bank), so that customers have more at stake with the bank and are more involved.
Since most of the customers predominantly use the "Blue" card, the bank should look at enhancing the customer base for its Gold, Platinum and Silver credit cards. The bank can offer sign-up bonus programs for these less preferred credit cards, and perhaps offer travel points etc., so that these cards are lucrative to the customers.
Since most of the bank's customers are young people who are Graduates and make less than 40K per year, it seems that the bank needs to have more options for this clientele. The bank can offer credit limits in a low range to attract more customers, as well as offer a lower or no annual fee for a certain period to attract more customers.
Since the bank has a lot of College students and High Schoolers as customers, the bank can offer special discounts or points when these customers (students) make purchases for education purposes — like buying electronics such as laptops/computers, online education portals, books etc. — so that customers can use the credit card frequently and easily.
Since most of the bank's customers fall under the category of Married Graduates, earning less than 40K per year and owning the "Blue" credit card, the main aim for this category would be to make the most of the credit card, by either saving money when using the card or accumulating points for their purchases. The bank should look at lucrative ways to engage, attract and retain such customers by giving them loyalty points for transactions and small incentives when the credit balance is paid early, so that there is a reason for the customers to stay. The bank can engage with merchants where its credit card is mostly used and see if they can collaborate to provide incentives to customers.